{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 准备工作"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests_html\n",
    "from lxml.html import fromstring\n",
    "import time\n",
    "from random import random\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from requests_html import HTMLSession"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-127-8ae4025e7ff4>:19: DeprecationWarning: use options instead of chrome_options\n",
      "  driver = webdriver.Chrome( chrome_options = opts) #desired_capabilities=caps,\n"
     ]
    }
   ],
   "source": [
    "from selenium import webdriver\n",
    "from selenium.webdriver.common.desired_capabilities import DesiredCapabilities\n",
    "\n",
    "\n",
    "#caps=dict()\n",
    "#caps[\"pageLoadStrategy\"] = \"none\"   # Do not wait for full page load\n",
    "\n",
    "opts = webdriver.ChromeOptions()\n",
    "opts.add_argument('--no-sandbox')#解决DevToolsActivePort文件不存在的报错\n",
    "opts.add_argument('window-size=1920x3000') #指定浏览器分辨率\n",
    "opts.add_argument('--disable-gpu') #谷歌文档提到需要加上一这个属性来规避bug\n",
    "opts.add_argument('--hide-scrollbars') #隐藏滚动条, 应对些特殊页面\n",
    "#opts.add_argument('blink-settings=imagesEnabled=false') #不加载图片, 提升速度\n",
    "#opts.add_argument('--headless') #浏览器不提供可视化页面. linux下如果系统不支持可视化不加这条会启动失败\n",
    "# opts.binary_location = \"C:\\portable\\PortableApps\\IronPortable\\App\\Iron\\chrome.exe\"\n",
    "# opts.binary_location = \"C:\\Program Files\\Google\\Chrome\\Application\\chromedriver.exe\" #\"H:\\_coding_\\Gitee\\InternetNewMedia\\CapstonePrj2016\\chromedriver.exe\"  \n",
    "\n",
    "\n",
    "driver = webdriver.Chrome( chrome_options = opts) #desired_capabilities=caps,"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 输入“公众号”参数\n",
    "公众号 = \"南方都市报\"\n",
    "# 指定内容输出的位置\n",
    "fn = { \"output\" : { \"公众号_htm_snippets\": \"data_raw_src/公众号_htm_snippets_{公众号}.tsv\",\n",
    "                    \"公众号_df\": \"data_raw_src/公众号_df_{公众号}.tsv\",\n",
    "                    \"公众号_xlsx\": \"data_sets/公众号_url_{公众号}.xlsx\" } \\\n",
    "      }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.get(\"https://mp.weixin.qq.com\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 自动化登录"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {},
   "outputs": [],
   "source": [
    "element = driver.find_element_by_xpath('//a[@class=\"login__type__container__select-type\"]')\n",
    "element.get_attribute('innerHTML')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {},
   "outputs": [],
   "source": [
    "payload =  {\"account\": \"2295447991@qq.com\", \"password\": \"Huang1zi@\"}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {},
   "outputs": [],
   "source": [
    "element=driver.find_element_by_xpath('.//input[@name=\"account\"]')\n",
    "element.get_attribute('innerHTML')\n",
    "element.clear()\n",
    "element.send_keys(payload['account'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {},
   "outputs": [],
   "source": [
    "## 清空密码input\n",
    "driver.find_element_by_xpath('//form[@class=\"login_form\"]//input[@name=\"password\"]').clear()\n",
    "driver.find_element_by_xpath('//form[@class=\"login_form\"]//input[@name=\"password\"]').send_keys(payload['password'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.find_element_by_xpath('//div[@class=\"login_btn_panel\"]/a').click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 寻找选单"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 展开\n",
    "element = driver.find_element_by_xpath('//*[@id=\"mp_header\"]/div/div/a')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 136,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 点击图文素材\n",
    "element = driver.find_element_by_xpath('//*[@id=\"menuBar\"]/li[2]/ul/li[1]') \n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 点击 \"+\"  \n",
    "element = driver.find_element_by_xpath('//i[@class=\"weui-desktop-card__icon-add\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 139,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 新的图文\n",
    "element = driver.find_element_by_xpath('//*[@id=\"js_main\"]/div[3]/div[2]/div/div/div/div[1]/div/div/div[2]/ul/li[1]') \n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['CDwindow-1434A1DFF314595F962F45A45DAB4DA7', 'CDwindow-ACB709E6FA07A87843C7CF77DCA09D8A']\n"
     ]
    }
   ],
   "source": [
    "# 检查窗口信息\n",
    "print (driver.window_handles)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 141,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-141-704fdf3805c1>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[1])\n"
     ]
    }
   ],
   "source": [
    "# 切换窗口\n",
    "driver.switch_to_window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 142,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 点击超链接\n",
    "element = driver.find_element_by_xpath('//*[@id=\"js_editor_insertlink\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 选择其他公众号\n",
    "element = driver.find_element_by_xpath('/html/body/div[2]/div/div/div/div/div[6]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[3]/div/div/p/div/button')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 144,
   "metadata": {},
   "outputs": [],
   "source": [
    "# input 输入关键词\n",
    "element = driver.find_element_by_xpath('//input[@placeholder=\"输入文章来源的公众号名称或微信号，回车进行搜索\"]')\n",
    "element.get_attribute('innerHTML')\n",
    "element.clear()\n",
    "element.send_keys(公众号)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 145,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<div class=\"weui-desktop-icon weui-desktop-icon__search weui-desktop-icon__small\" style=\"width: 20px; height: 20px;\"><!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!---->     <svg viewBox=\"0 0 24 24\" version=\"1.1\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\"><title>MP/Icon/Search</title> <g id=\"MP/Icon/Search\" stroke=\"none\" stroke-width=\"1\" fill=\"none\" fill-rule=\"evenodd\"><path d=\"M5.78025253,5.78248558 C8.51392257,3.04881554 12.9460774,3.04881554 15.6797475,5.78248558 C18.1730922,8.27583028 18.3922898,12.1821488 16.3373403,14.9239313 L20.6294949,19.2175144 L19.2152814,20.631728 L14.922508,16.3389663 C12.180685,18.394566 8.27384272,18.1755707 5.78025253,15.6819805 C3.04658249,12.9483105 3.04658249,8.51615562 5.78025253,5.78248558 Z M6.8409127,6.84314575 C4.6930291,8.99102935 4.6930291,12.4734367 6.8409127,14.6213203 C8.98879631,16.7692039 12.4712037,16.7692039 14.6190873,14.6213203 C16.7669709,12.4734367 16.7669709,8.99102935 14.6190873,6.84314575 C12.4712037,4.69526215 8.98879631,4.69526215 6.8409127,6.84314575 Z\" id=\"形状\"></path></g></svg> <!----> <!----> <!----> <!----> <!----></div>\n"
     ]
    }
   ],
   "source": [
    "# 点放大镜搜\n",
    "element = driver.find_element_by_xpath('//button[@class=\"weui-desktop-icon-btn weui-desktop-search__btn\"]')\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 146,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/3OEpTPib0kVicrzDWicH7JWJpiacbysXllOEbCSNUJCZxEZEhgP51W1Y8om1ZHyxlfMw7dBSw2IgDWCjshddggeiaFQ/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">南方都市报</strong> <i class=\"inner_link_account_wechat\">微信号：nddaily</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/zzoUJzlKT41tz4jfPPzxI6nWsXQdo458HkQRw6RfAWbBIPVT9NcibYTzYGQQn3l6Fs5dLwenuwBMyS03S28rOqg/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">爱南方</strong> <i class=\"inner_link_account_wechat\">微信号：未设置</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">服务号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/jribTUbtKkD0sjMf7v2lhz06ZhFcdW38XdraMAfNB49dbqgibCzZx3E6xwDxa1WxaKlaHK3n0yWic6P6P3CibYwVwQ/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">都市报童</strong> <i class=\"inner_link_account_wechat\">微信号：dushibaotong</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/zlrHiaWN66fpUy0XLwTteT1ibM39FwW8ylgEFghOuOd4SWwCX3IdDWoVaHaErYbIWU4ic1ibXSI4DQVGqZ9g6pS0Vg/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">正點观影</strong> <i class=\"inner_link_account_wechat\">微信号：nd_ent</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li>\n"
     ]
    }
   ],
   "source": [
    "element = driver.find_element_by_xpath('//ul[@class=\"inner_link_account_list\"]')\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "公众号SERP = main_content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 解析\n",
    "root = fromstring(公众号SERP) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 148,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>nickname</th>\n",
       "      <th>wechat</th>\n",
       "      <th>img</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>南方都市报</td>\n",
       "      <td>微信号：nddaily</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/3OEpTPib0kVicrz...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>爱南方</td>\n",
       "      <td>微信号：未设置</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/zzoUJzlKT41tz4j...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>都市报童</td>\n",
       "      <td>微信号：dushibaotong</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/jribTUbtKkD0sjM...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>正點观影</td>\n",
       "      <td>微信号：nd_ent</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/zlrHiaWN66fpUy0...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  nickname            wechat  \\\n",
       "0    南方都市报       微信号：nddaily   \n",
       "1      爱南方           微信号：未设置   \n",
       "2     都市报童  微信号：dushibaotong   \n",
       "3     正點观影        微信号：nd_ent   \n",
       "\n",
       "                                                 img  \n",
       "0  http://mmbiz.qpic.cn/mmbiz_png/3OEpTPib0kVicrz...  \n",
       "1  http://mmbiz.qpic.cn/mmbiz_png/zzoUJzlKT41tz4j...  \n",
       "2  http://mmbiz.qpic.cn/mmbiz_png/jribTUbtKkD0sjM...  \n",
       "3  http://mmbiz.qpic.cn/mmbiz_png/zlrHiaWN66fpUy0...  "
      ]
     },
     "execution_count": 148,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "主 = root.xpath('//li[@class=\"inner_link_account_item\"]')\n",
    "\n",
    "account_list = []\n",
    "for e in 主:\n",
    "    account_nickname = e.xpath('./div/strong[@class=\"inner_link_account_nickname\"]')[0].text\n",
    "    account_wechat = e.xpath('./div/i[@class=\"inner_link_account_wechat\"]')[0].text\n",
    "    account_img = e.xpath('./div/img/@src')[0]\n",
    "    account = {\"nickname\": account_nickname, \"wechat\": account_wechat, \"img\": account_img,}\n",
    "    account_list.append(account)\n",
    "\n",
    "df_account = pd.DataFrame(account_list)\n",
    "df_account"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 获取文章链接和正文"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 149,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/3OEpTPib0kVicrzDWicH7JWJpiacbysXllOEbCSNUJCZxEZEhgP51W1Y8om1ZHyxlfMw7dBSw2IgDWCjshddggeiaFQ/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">南方都市报</strong> <i class=\"inner_link_account_wechat\">微信号：nddaily</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div>\n"
     ]
    }
   ],
   "source": [
    "element = driver.find_element_by_xpath('//ul[@class=\"inner_link_account_list\"]/li')\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 150,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\\n跳转_input = driver.find_element_by_xpath(\\'//span[@class=\"weui-desktop-pagination__form\"]/input\\')\\n跳转_a = driver.find_element_by_xpath(\\'//span[@class=\"weui-desktop-pagination__form\"]/a\\')\\n跳转_input.clear()\\n跳转_input.send_keys(2)\\n跳转_a.click()\\n'"
      ]
     },
     "execution_count": 150,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 跳转testing\n",
    "'''\n",
    "跳转_input = driver.find_element_by_xpath('//span[@class=\"weui-desktop-pagination__form\"]/input')\n",
    "跳转_a = driver.find_element_by_xpath('//span[@class=\"weui-desktop-pagination__form\"]/a')\n",
    "跳转_input.clear()\n",
    "跳转_input.send_keys(2)\n",
    "跳转_a.click()\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 151,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 1336]\n",
      "False\n"
     ]
    }
   ],
   "source": [
    "# 跳转上限\n",
    "l_e = driver.find_elements_by_xpath('//label[@class=\"weui-desktop-pagination__num\"]')\n",
    "l_e_int  = [int(x.text) for x in l_e] \n",
    "print (l_e_int)\n",
    "print (l_e_int[0]==l_e_int[-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 152,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "False\n",
      "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55]\n"
     ]
    }
   ],
   "source": [
    "pages = list(range(l_e_int[0],l_e_int[-1]+1 ))\n",
    "# 改页码上限\n",
    "print (l_e_int[0]==l_e_int[-1])\n",
    "print(pages[0:55])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 153,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 厉遍循环\n",
    "html_raw = dict()\n",
    "main_content =\"\"\n",
    "element = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 154,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_pages (pages):\n",
    "    for p in pages:\n",
    "        print (p,end='\\t')\n",
    "\n",
    "        跳转_input = driver.find_element_by_xpath('//span[@class=\"weui-desktop-pagination__form\"]/input')\n",
    "        跳转_a = driver.find_element_by_xpath('//span[@class=\"weui-desktop-pagination__form\"]/a')\n",
    "        跳转_input.clear()\n",
    "        跳转_input.send_keys(p)\n",
    "        跳转_a.click()\n",
    "\n",
    "        time.sleep(45+120*random())\n",
    "\n",
    "        element = driver.find_element_by_xpath('//div[@class=\"inner_link_article_list\"]')\n",
    "        main_content = element.get_attribute('innerHTML')\n",
    "        #print(main_content)\n",
    "        html_raw[p] = main_content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 155,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\t2\t3\t4\t5\t6\t7\t8\t9\t10\t11\t12\t13\t14\t15\t16\t17\t18\t19\t20\t21\t22\t23\t24\t25\t26\t27\t28\t29\t30\t31\t32\t33\t34\t35\t36\t37\t38\t39\t40\t41\t42\t43\t44\t45\t46\t47\t48\t49\t50\t51\t52\t53\t54\t55\t"
     ]
    }
   ],
   "source": [
    "process_pages(pages[0:55])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 156,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>html_snippets</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>43</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>46</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>51</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>52</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>53</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>54</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>55</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        html_snippets\n",
       "1   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "2   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "3   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "4   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "5   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "6   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "7   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "8   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "9   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "10  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "11  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "12  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "13  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "14  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "15  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "16  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "17  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "18  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "19  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "20  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "21  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "22  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "23  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "24  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "25  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "26  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "27  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "28  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "29  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "30  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "31  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "32  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "33  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "34  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "35  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "36  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "37  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "38  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "39  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "40  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "41  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "42  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "43  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "44  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "45  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "46  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "47  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "48  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "49  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "50  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "51  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "52  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "53  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "54  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "55  <div class=\"weui-desktop-radio-group\"><label c..."
      ]
     },
     "execution_count": 156,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame([html_raw]).T\n",
    "df.columns = [\"html_snippets\"]\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 157,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Stored 'html_raw' (dict)\n"
     ]
    }
   ],
   "source": [
    "%store html_raw\n",
    "import pickle \n",
    "filehandler = open(\"html_raw\", 'wb') \n",
    "pickle.dump(html_raw, filehandler)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 158,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "55\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>html_snippets</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [html_snippets]\n",
       "Index: []"
      ]
     },
     "execution_count": 158,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_out = df[~df.duplicated()]\n",
    "print (len(df_out))\n",
    "df[df.duplicated()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 159,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[56,\n",
       " 57,\n",
       " 58,\n",
       " 59,\n",
       " 60,\n",
       " 61,\n",
       " 62,\n",
       " 63,\n",
       " 64,\n",
       " 65,\n",
       " 66,\n",
       " 67,\n",
       " 68,\n",
       " 69,\n",
       " 70,\n",
       " 71,\n",
       " 72,\n",
       " 73,\n",
       " 74,\n",
       " 75,\n",
       " 76,\n",
       " 77,\n",
       " 78,\n",
       " 79,\n",
       " 80,\n",
       " 81,\n",
       " 82,\n",
       " 83,\n",
       " 84,\n",
       " 85,\n",
       " 86,\n",
       " 87,\n",
       " 88,\n",
       " 89,\n",
       " 90,\n",
       " 91,\n",
       " 92,\n",
       " 93,\n",
       " 94,\n",
       " 95,\n",
       " 96,\n",
       " 97,\n",
       " 98,\n",
       " 99,\n",
       " 100,\n",
       " 101,\n",
       " 102,\n",
       " 103,\n",
       " 104,\n",
       " 105,\n",
       " 106,\n",
       " 107,\n",
       " 108,\n",
       " 109,\n",
       " 110,\n",
       " 111,\n",
       " 112,\n",
       " 113,\n",
       " 114,\n",
       " 115,\n",
       " 116,\n",
       " 117,\n",
       " 118,\n",
       " 119,\n",
       " 120,\n",
       " 121,\n",
       " 122,\n",
       " 123,\n",
       " 124,\n",
       " 125,\n",
       " 126,\n",
       " 127,\n",
       " 128,\n",
       " 129,\n",
       " 130,\n",
       " 131,\n",
       " 132,\n",
       " 133,\n",
       " 134,\n",
       " 135,\n",
       " 136,\n",
       " 137,\n",
       " 138,\n",
       " 139,\n",
       " 140,\n",
       " 141,\n",
       " 142,\n",
       " 143,\n",
       " 144,\n",
       " 145,\n",
       " 146,\n",
       " 147,\n",
       " 148,\n",
       " 149,\n",
       " 150,\n",
       " 151,\n",
       " 152,\n",
       " 153,\n",
       " 154,\n",
       " 155,\n",
       " 156,\n",
       " 157,\n",
       " 158,\n",
       " 159,\n",
       " 160,\n",
       " 161,\n",
       " 162,\n",
       " 163,\n",
       " 164,\n",
       " 165,\n",
       " 166,\n",
       " 167,\n",
       " 168,\n",
       " 169,\n",
       " 170,\n",
       " 171,\n",
       " 172,\n",
       " 173,\n",
       " 174,\n",
       " 175,\n",
       " 176,\n",
       " 177,\n",
       " 178,\n",
       " 179,\n",
       " 180,\n",
       " 181,\n",
       " 182,\n",
       " 183,\n",
       " 184,\n",
       " 185,\n",
       " 186,\n",
       " 187,\n",
       " 188,\n",
       " 189,\n",
       " 190,\n",
       " 191,\n",
       " 192,\n",
       " 193,\n",
       " 194,\n",
       " 195,\n",
       " 196,\n",
       " 197,\n",
       " 198,\n",
       " 199,\n",
       " 200,\n",
       " 201,\n",
       " 202,\n",
       " 203,\n",
       " 204,\n",
       " 205,\n",
       " 206,\n",
       " 207,\n",
       " 208,\n",
       " 209,\n",
       " 210,\n",
       " 211,\n",
       " 212,\n",
       " 213,\n",
       " 214,\n",
       " 215,\n",
       " 216,\n",
       " 217,\n",
       " 218,\n",
       " 219,\n",
       " 220,\n",
       " 221,\n",
       " 222,\n",
       " 223,\n",
       " 224,\n",
       " 225,\n",
       " 226,\n",
       " 227,\n",
       " 228,\n",
       " 229,\n",
       " 230,\n",
       " 231,\n",
       " 232,\n",
       " 233,\n",
       " 234,\n",
       " 235,\n",
       " 236,\n",
       " 237,\n",
       " 238,\n",
       " 239,\n",
       " 240,\n",
       " 241,\n",
       " 242,\n",
       " 243,\n",
       " 244,\n",
       " 245,\n",
       " 246,\n",
       " 247,\n",
       " 248,\n",
       " 249,\n",
       " 250,\n",
       " 251,\n",
       " 252,\n",
       " 253,\n",
       " 254,\n",
       " 255,\n",
       " 256,\n",
       " 257,\n",
       " 258,\n",
       " 259,\n",
       " 260,\n",
       " 261,\n",
       " 262,\n",
       " 263,\n",
       " 264,\n",
       " 265,\n",
       " 266,\n",
       " 267,\n",
       " 268,\n",
       " 269,\n",
       " 270,\n",
       " 271,\n",
       " 272,\n",
       " 273,\n",
       " 274,\n",
       " 275,\n",
       " 276,\n",
       " 277,\n",
       " 278,\n",
       " 279,\n",
       " 280,\n",
       " 281,\n",
       " 282,\n",
       " 283,\n",
       " 284,\n",
       " 285,\n",
       " 286,\n",
       " 287,\n",
       " 288,\n",
       " 289,\n",
       " 290,\n",
       " 291,\n",
       " 292,\n",
       " 293,\n",
       " 294,\n",
       " 295,\n",
       " 296,\n",
       " 297,\n",
       " 298,\n",
       " 299,\n",
       " 300,\n",
       " 301,\n",
       " 302,\n",
       " 303,\n",
       " 304,\n",
       " 305,\n",
       " 306,\n",
       " 307,\n",
       " 308,\n",
       " 309,\n",
       " 310,\n",
       " 311,\n",
       " 312,\n",
       " 313,\n",
       " 314,\n",
       " 315,\n",
       " 316,\n",
       " 317,\n",
       " 318,\n",
       " 319,\n",
       " 320,\n",
       " 321,\n",
       " 322,\n",
       " 323,\n",
       " 324,\n",
       " 325,\n",
       " 326,\n",
       " 327,\n",
       " 328,\n",
       " 329,\n",
       " 330,\n",
       " 331,\n",
       " 332,\n",
       " 333,\n",
       " 334,\n",
       " 335,\n",
       " 336,\n",
       " 337,\n",
       " 338,\n",
       " 339,\n",
       " 340,\n",
       " 341,\n",
       " 342,\n",
       " 343,\n",
       " 344,\n",
       " 345,\n",
       " 346,\n",
       " 347,\n",
       " 348,\n",
       " 349,\n",
       " 350,\n",
       " 351,\n",
       " 352,\n",
       " 353,\n",
       " 354,\n",
       " 355,\n",
       " 356,\n",
       " 357,\n",
       " 358,\n",
       " 359,\n",
       " 360,\n",
       " 361,\n",
       " 362,\n",
       " 363,\n",
       " 364,\n",
       " 365,\n",
       " 366,\n",
       " 367,\n",
       " 368,\n",
       " 369,\n",
       " 370,\n",
       " 371,\n",
       " 372,\n",
       " 373,\n",
       " 374,\n",
       " 375,\n",
       " 376,\n",
       " 377,\n",
       " 378,\n",
       " 379,\n",
       " 380,\n",
       " 381,\n",
       " 382,\n",
       " 383,\n",
       " 384,\n",
       " 385,\n",
       " 386,\n",
       " 387,\n",
       " 388,\n",
       " 389,\n",
       " 390,\n",
       " 391,\n",
       " 392,\n",
       " 393,\n",
       " 394,\n",
       " 395,\n",
       " 396,\n",
       " 397,\n",
       " 398,\n",
       " 399,\n",
       " 400,\n",
       " 401,\n",
       " 402,\n",
       " 403,\n",
       " 404,\n",
       " 405,\n",
       " 406,\n",
       " 407,\n",
       " 408,\n",
       " 409,\n",
       " 410,\n",
       " 411,\n",
       " 412,\n",
       " 413,\n",
       " 414,\n",
       " 415,\n",
       " 416,\n",
       " 417,\n",
       " 418,\n",
       " 419,\n",
       " 420,\n",
       " 421,\n",
       " 422,\n",
       " 423,\n",
       " 424,\n",
       " 425,\n",
       " 426,\n",
       " 427,\n",
       " 428,\n",
       " 429,\n",
       " 430,\n",
       " 431,\n",
       " 432,\n",
       " 433,\n",
       " 434,\n",
       " 435,\n",
       " 436,\n",
       " 437,\n",
       " 438,\n",
       " 439,\n",
       " 440,\n",
       " 441,\n",
       " 442,\n",
       " 443,\n",
       " 444,\n",
       " 445,\n",
       " 446,\n",
       " 447,\n",
       " 448,\n",
       " 449,\n",
       " 450,\n",
       " 451,\n",
       " 452,\n",
       " 453,\n",
       " 454,\n",
       " 455,\n",
       " 456,\n",
       " 457,\n",
       " 458,\n",
       " 459,\n",
       " 460,\n",
       " 461,\n",
       " 462,\n",
       " 463,\n",
       " 464,\n",
       " 465,\n",
       " 466,\n",
       " 467,\n",
       " 468,\n",
       " 469,\n",
       " 470,\n",
       " 471,\n",
       " 472,\n",
       " 473,\n",
       " 474,\n",
       " 475,\n",
       " 476,\n",
       " 477,\n",
       " 478,\n",
       " 479,\n",
       " 480,\n",
       " 481,\n",
       " 482,\n",
       " 483,\n",
       " 484,\n",
       " 485,\n",
       " 486,\n",
       " 487,\n",
       " 488,\n",
       " 489,\n",
       " 490,\n",
       " 491,\n",
       " 492,\n",
       " 493,\n",
       " 494,\n",
       " 495,\n",
       " 496,\n",
       " 497,\n",
       " 498,\n",
       " 499,\n",
       " 500,\n",
       " 501,\n",
       " 502,\n",
       " 503,\n",
       " 504,\n",
       " 505,\n",
       " 506,\n",
       " 507,\n",
       " 508,\n",
       " 509,\n",
       " 510,\n",
       " 511,\n",
       " 512,\n",
       " 513,\n",
       " 514,\n",
       " 515,\n",
       " 516,\n",
       " 517,\n",
       " 518,\n",
       " 519,\n",
       " 520,\n",
       " 521,\n",
       " 522,\n",
       " 523,\n",
       " 524,\n",
       " 525,\n",
       " 526,\n",
       " 527,\n",
       " 528,\n",
       " 529,\n",
       " 530,\n",
       " 531,\n",
       " 532,\n",
       " 533,\n",
       " 534,\n",
       " 535,\n",
       " 536,\n",
       " 537,\n",
       " 538,\n",
       " 539,\n",
       " 540,\n",
       " 541,\n",
       " 542,\n",
       " 543,\n",
       " 544,\n",
       " 545,\n",
       " 546,\n",
       " 547,\n",
       " 548,\n",
       " 549,\n",
       " 550,\n",
       " 551,\n",
       " 552,\n",
       " 553,\n",
       " 554,\n",
       " 555,\n",
       " 556,\n",
       " 557,\n",
       " 558,\n",
       " 559,\n",
       " 560,\n",
       " 561,\n",
       " 562,\n",
       " 563,\n",
       " 564,\n",
       " 565,\n",
       " 566,\n",
       " 567,\n",
       " 568,\n",
       " 569,\n",
       " 570,\n",
       " 571,\n",
       " 572,\n",
       " 573,\n",
       " 574,\n",
       " 575,\n",
       " 576,\n",
       " 577,\n",
       " 578,\n",
       " 579,\n",
       " 580,\n",
       " 581,\n",
       " 582,\n",
       " 583,\n",
       " 584,\n",
       " 585,\n",
       " 586,\n",
       " 587,\n",
       " 588,\n",
       " 589,\n",
       " 590,\n",
       " 591,\n",
       " 592,\n",
       " 593,\n",
       " 594,\n",
       " 595,\n",
       " 596,\n",
       " 597,\n",
       " 598,\n",
       " 599,\n",
       " 600,\n",
       " 601,\n",
       " 602,\n",
       " 603,\n",
       " 604,\n",
       " 605,\n",
       " 606,\n",
       " 607,\n",
       " 608,\n",
       " 609,\n",
       " 610,\n",
       " 611,\n",
       " 612,\n",
       " 613,\n",
       " 614,\n",
       " 615,\n",
       " 616,\n",
       " 617,\n",
       " 618,\n",
       " 619,\n",
       " 620,\n",
       " 621,\n",
       " 622,\n",
       " 623,\n",
       " 624,\n",
       " 625,\n",
       " 626,\n",
       " 627,\n",
       " 628,\n",
       " 629,\n",
       " 630,\n",
       " 631,\n",
       " 632,\n",
       " 633,\n",
       " 634,\n",
       " 635,\n",
       " 636,\n",
       " 637,\n",
       " 638,\n",
       " 639,\n",
       " 640,\n",
       " 641,\n",
       " 642,\n",
       " 643,\n",
       " 644,\n",
       " 645,\n",
       " 646,\n",
       " 647,\n",
       " 648,\n",
       " 649,\n",
       " 650,\n",
       " 651,\n",
       " 652,\n",
       " 653,\n",
       " 654,\n",
       " 655,\n",
       " 656,\n",
       " 657,\n",
       " 658,\n",
       " 659,\n",
       " 660,\n",
       " 661,\n",
       " 662,\n",
       " 663,\n",
       " 664,\n",
       " 665,\n",
       " 666,\n",
       " 667,\n",
       " 668,\n",
       " 669,\n",
       " 670,\n",
       " 671,\n",
       " 672,\n",
       " 673,\n",
       " 674,\n",
       " 675,\n",
       " 676,\n",
       " 677,\n",
       " 678,\n",
       " 679,\n",
       " 680,\n",
       " 681,\n",
       " 682,\n",
       " 683,\n",
       " 684,\n",
       " 685,\n",
       " 686,\n",
       " 687,\n",
       " 688,\n",
       " 689,\n",
       " 690,\n",
       " 691,\n",
       " 692,\n",
       " 693,\n",
       " 694,\n",
       " 695,\n",
       " 696,\n",
       " 697,\n",
       " 698,\n",
       " 699,\n",
       " 700,\n",
       " 701,\n",
       " 702,\n",
       " 703,\n",
       " 704,\n",
       " 705,\n",
       " 706,\n",
       " 707,\n",
       " 708,\n",
       " 709,\n",
       " 710,\n",
       " 711,\n",
       " 712,\n",
       " 713,\n",
       " 714,\n",
       " 715,\n",
       " 716,\n",
       " 717,\n",
       " 718,\n",
       " 719,\n",
       " 720,\n",
       " 721,\n",
       " 722,\n",
       " 723,\n",
       " 724,\n",
       " 725,\n",
       " 726,\n",
       " 727,\n",
       " 728,\n",
       " 729,\n",
       " 730,\n",
       " 731,\n",
       " 732,\n",
       " 733,\n",
       " 734,\n",
       " 735,\n",
       " 736,\n",
       " 737,\n",
       " 738,\n",
       " 739,\n",
       " 740,\n",
       " 741,\n",
       " 742,\n",
       " 743,\n",
       " 744,\n",
       " 745,\n",
       " 746,\n",
       " 747,\n",
       " 748,\n",
       " 749,\n",
       " 750,\n",
       " 751,\n",
       " 752,\n",
       " 753,\n",
       " 754,\n",
       " 755,\n",
       " 756,\n",
       " 757,\n",
       " 758,\n",
       " 759,\n",
       " 760,\n",
       " 761,\n",
       " 762,\n",
       " 763,\n",
       " 764,\n",
       " 765,\n",
       " 766,\n",
       " 767,\n",
       " 768,\n",
       " 769,\n",
       " 770,\n",
       " 771,\n",
       " 772,\n",
       " 773,\n",
       " 774,\n",
       " 775,\n",
       " 776,\n",
       " 777,\n",
       " 778,\n",
       " 779,\n",
       " 780,\n",
       " 781,\n",
       " 782,\n",
       " 783,\n",
       " 784,\n",
       " 785,\n",
       " 786,\n",
       " 787,\n",
       " 788,\n",
       " 789,\n",
       " 790,\n",
       " 791,\n",
       " 792,\n",
       " 793,\n",
       " 794,\n",
       " 795,\n",
       " 796,\n",
       " 797,\n",
       " 798,\n",
       " 799,\n",
       " 800,\n",
       " 801,\n",
       " 802,\n",
       " 803,\n",
       " 804,\n",
       " 805,\n",
       " 806,\n",
       " 807,\n",
       " 808,\n",
       " 809,\n",
       " 810,\n",
       " 811,\n",
       " 812,\n",
       " 813,\n",
       " 814,\n",
       " 815,\n",
       " 816,\n",
       " 817,\n",
       " 818,\n",
       " 819,\n",
       " 820,\n",
       " 821,\n",
       " 822,\n",
       " 823,\n",
       " 824,\n",
       " 825,\n",
       " 826,\n",
       " 827,\n",
       " 828,\n",
       " 829,\n",
       " 830,\n",
       " 831,\n",
       " 832,\n",
       " 833,\n",
       " 834,\n",
       " 835,\n",
       " 836,\n",
       " 837,\n",
       " 838,\n",
       " 839,\n",
       " 840,\n",
       " 841,\n",
       " 842,\n",
       " 843,\n",
       " 844,\n",
       " 845,\n",
       " 846,\n",
       " 847,\n",
       " 848,\n",
       " 849,\n",
       " 850,\n",
       " 851,\n",
       " 852,\n",
       " 853,\n",
       " 854,\n",
       " 855,\n",
       " 856,\n",
       " 857,\n",
       " 858,\n",
       " 859,\n",
       " 860,\n",
       " 861,\n",
       " 862,\n",
       " 863,\n",
       " 864,\n",
       " 865,\n",
       " 866,\n",
       " 867,\n",
       " 868,\n",
       " 869,\n",
       " 870,\n",
       " 871,\n",
       " 872,\n",
       " 873,\n",
       " 874,\n",
       " 875,\n",
       " 876,\n",
       " 877,\n",
       " 878,\n",
       " 879,\n",
       " 880,\n",
       " 881,\n",
       " 882,\n",
       " 883,\n",
       " 884,\n",
       " 885,\n",
       " 886,\n",
       " 887,\n",
       " 888,\n",
       " 889,\n",
       " 890,\n",
       " 891,\n",
       " 892,\n",
       " 893,\n",
       " 894,\n",
       " 895,\n",
       " 896,\n",
       " 897,\n",
       " 898,\n",
       " 899,\n",
       " 900,\n",
       " 901,\n",
       " 902,\n",
       " 903,\n",
       " 904,\n",
       " 905,\n",
       " 906,\n",
       " 907,\n",
       " 908,\n",
       " 909,\n",
       " 910,\n",
       " 911,\n",
       " 912,\n",
       " 913,\n",
       " 914,\n",
       " 915,\n",
       " 916,\n",
       " 917,\n",
       " 918,\n",
       " 919,\n",
       " 920,\n",
       " 921,\n",
       " 922,\n",
       " 923,\n",
       " 924,\n",
       " 925,\n",
       " 926,\n",
       " 927,\n",
       " 928,\n",
       " 929,\n",
       " 930,\n",
       " 931,\n",
       " 932,\n",
       " 933,\n",
       " 934,\n",
       " 935,\n",
       " 936,\n",
       " 937,\n",
       " 938,\n",
       " 939,\n",
       " 940,\n",
       " 941,\n",
       " 942,\n",
       " 943,\n",
       " 944,\n",
       " 945,\n",
       " 946,\n",
       " 947,\n",
       " 948,\n",
       " 949,\n",
       " 950,\n",
       " 951,\n",
       " 952,\n",
       " 953,\n",
       " 954,\n",
       " 955,\n",
       " 956,\n",
       " 957,\n",
       " 958,\n",
       " 959,\n",
       " 960,\n",
       " 961,\n",
       " 962,\n",
       " 963,\n",
       " 964,\n",
       " 965,\n",
       " 966,\n",
       " 967,\n",
       " 968,\n",
       " 969,\n",
       " 970,\n",
       " 971,\n",
       " 972,\n",
       " 973,\n",
       " 974,\n",
       " 975,\n",
       " 976,\n",
       " 977,\n",
       " 978,\n",
       " 979,\n",
       " 980,\n",
       " 981,\n",
       " 982,\n",
       " 983,\n",
       " 984,\n",
       " 985,\n",
       " 986,\n",
       " 987,\n",
       " 988,\n",
       " 989,\n",
       " 990,\n",
       " 991,\n",
       " 992,\n",
       " 993,\n",
       " 994,\n",
       " 995,\n",
       " 996,\n",
       " 997,\n",
       " 998,\n",
       " 999,\n",
       " 1000,\n",
       " 1001,\n",
       " 1002,\n",
       " 1003,\n",
       " 1004,\n",
       " 1005,\n",
       " 1006,\n",
       " 1007,\n",
       " 1008,\n",
       " 1009,\n",
       " 1010,\n",
       " 1011,\n",
       " 1012,\n",
       " 1013,\n",
       " 1014,\n",
       " 1015,\n",
       " 1016,\n",
       " 1017,\n",
       " 1018,\n",
       " 1019,\n",
       " 1020,\n",
       " 1021,\n",
       " 1022,\n",
       " 1023,\n",
       " 1024,\n",
       " 1025,\n",
       " 1026,\n",
       " 1027,\n",
       " 1028,\n",
       " 1029,\n",
       " 1030,\n",
       " 1031,\n",
       " 1032,\n",
       " 1033,\n",
       " 1034,\n",
       " 1035,\n",
       " 1036,\n",
       " 1037,\n",
       " 1038,\n",
       " 1039,\n",
       " 1040,\n",
       " 1041,\n",
       " 1042,\n",
       " 1043,\n",
       " 1044,\n",
       " 1045,\n",
       " 1046,\n",
       " 1047,\n",
       " 1048,\n",
       " 1049,\n",
       " 1050,\n",
       " 1051,\n",
       " 1052,\n",
       " 1053,\n",
       " 1054,\n",
       " 1055,\n",
       " ...]"
      ]
     },
     "execution_count": 159,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "try_again = list(df[df.duplicated()].index)\n",
    "print(try_again)\n",
    "try_again = try_again + list (set(pages).difference(set(df.index.values)))\n",
    "try_again"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 160,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 暂存档\n",
    "filename = fn [\"output\"] [\"公众号_htm_snippets\"] \n",
    "df_out.to_csv(filename.format(公众号=公众号), sep=\"\\t\", encoding=\"utf8\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 162,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "40,39,40,40,40,40,40,40,40,40,40,40,39,39,40,40,40,40,40,40,40,39,40,40,40,40,40,40,40,40,40,39,40,40,40,40,40,40,40,40,40,40,40,40,40,40,40,40,40,40,39,39,40,39,39,"
     ]
    }
   ],
   "source": [
    "def get_content(link):\n",
    "    session = HTMLSession()\n",
    "    r = session.get(url=link)\n",
    "    content_xpath_1 = '//*[@id=\"js_content\"]//span/text()'\n",
    "    content_xpath_2 = '//*[@id=\"js_content\"]//p/text()'\n",
    "    content_1 = ''.join(r.html.xpath(content_xpath_1))\n",
    "    content_2 = ''.join(r.html.xpath(content_xpath_2))\n",
    "    return content_1 + content_2\n",
    "\n",
    "def parse_html_snippets(_snippet_):\n",
    "    root = fromstring(_snippet_) \n",
    "    title = [x for x in root.xpath('//div[@class=\"inner_link_article_title\"]//span[2]/text()')]\n",
    "    create_time = [x.text for x in root.xpath('//div[@class=\"inner_link_article_date\"]')]\n",
    "    link = [x for x in root.xpath('//a/@href')]\n",
    "    content_text = [get_content(x) for x in link]\n",
    "    _df_ = pd.DataFrame({\"title\":title, \"create_time\": create_time, \"link\":link, \"content_text\":content_text})\n",
    "    return(_df_)\n",
    "    \n",
    "l_df = []\n",
    "for p in pages[0:55]:\n",
    "    _df_ = parse_html_snippets(df.loc[p,\"html_snippets\"])\n",
    "    print (len(_df_), end=\",\")\n",
    "    l_df.append(_df_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 163,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "      <th>content_text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>广州刚刚通报！新增本土12例无症状感染者，详情公布</td>\n",
       "      <td>2021-05-30</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>据广东卫健委今晨（30日）通报，5月29日0-24时，全省新增境外输入确诊病例1例，广州报告...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>佛山新增1例本土无症状感染者，详情公布</td>\n",
       "      <td>2021-05-30</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>据广东卫健委今早（30日）通报，5月29日0-24时，据佛山市南海区新冠肺炎防控指挥部今天（...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>广东惠州发布通知！</td>\n",
       "      <td>2021-05-30</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>5月29日下午，广东惠州大亚湾区新型冠状病毒肺炎疫情防控领导小组发布通告，，对澳头街道、西区...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>他是TVB“警察专业户”，曾因撑警被抹黑，店铺被打砸</td>\n",
       "      <td>2021-05-30</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>“如果我当时没有站出来说出真实的想法，等我老了以后，我一定会为自己的懦弱感到后悔。”近期，曾...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>别刷了！朋友圈越刷越孤独！</td>\n",
       "      <td>2021-05-30</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>很多人习惯无聊就刷朋友圈一遍一遍地刷，明知道没几个更新其实，微信上还有许多有价值的东西比如，...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2186</th>\n",
       "      <td>5人吸入有毒气体死亡，8人受伤</td>\n",
       "      <td>2021-02-28</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>据吉林省吉林市政府新闻办公室微博通报，2月27日21时30分，吉林化纤公司发生一起安全生产事...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2187</th>\n",
       "      <td>董明珠：建议月薪一万才纳税</td>\n",
       "      <td>2021-02-28</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>“今年我建议国家加快建立自主的综合市场标准认证，不管是什么产品、什么国际标准，只要进入中国市...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2188</th>\n",
       "      <td>“李焕英”票房超《流浪地球》，中国影史票房第三</td>\n",
       "      <td>2021-02-28</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>作为今年电影春节档最大“赢家”，由贾玲执导、编剧、主演的电影《你好，李焕英》创造的票房神话还...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2189</th>\n",
       "      <td>饿了么诉美团，一审获赔100万</td>\n",
       "      <td>2021-02-28</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>一起不正当竞争官司，揭示外卖餐饮行业两大巨头为争夺商户资源的竞争状况。2019年，饿了么因不...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2190</th>\n",
       "      <td>四川滚石砸中民房，被埋人员已无生命迹象</td>\n",
       "      <td>2021-02-28</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>四川省峨边彝族自治县人民政府新闻办公室官方微博@今日峨边 2月27日通报，2月27日16时5...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2191 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                           title create_time  \\\n",
       "0      广州刚刚通报！新增本土12例无症状感染者，详情公布  2021-05-30   \n",
       "1            佛山新增1例本土无症状感染者，详情公布  2021-05-30   \n",
       "2                      广东惠州发布通知！  2021-05-30   \n",
       "3     他是TVB“警察专业户”，曾因撑警被抹黑，店铺被打砸  2021-05-30   \n",
       "4                  别刷了！朋友圈越刷越孤独！  2021-05-30   \n",
       "...                          ...         ...   \n",
       "2186             5人吸入有毒气体死亡，8人受伤  2021-02-28   \n",
       "2187               董明珠：建议月薪一万才纳税  2021-02-28   \n",
       "2188     “李焕英”票房超《流浪地球》，中国影史票房第三  2021-02-28   \n",
       "2189             饿了么诉美团，一审获赔100万  2021-02-28   \n",
       "2190         四川滚石砸中民房，被埋人员已无生命迹象  2021-02-28   \n",
       "\n",
       "                                                   link  \\\n",
       "0     http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "1     http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "2     http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "3     http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "4     http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "...                                                 ...   \n",
       "2186  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "2187  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "2188  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "2189  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "2190  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "\n",
       "                                           content_text  \n",
       "0     据广东卫健委今晨（30日）通报，5月29日0-24时，全省新增境外输入确诊病例1例，广州报告...  \n",
       "1     据广东卫健委今早（30日）通报，5月29日0-24时，据佛山市南海区新冠肺炎防控指挥部今天（...  \n",
       "2     5月29日下午，广东惠州大亚湾区新型冠状病毒肺炎疫情防控领导小组发布通告，，对澳头街道、西区...  \n",
       "3     “如果我当时没有站出来说出真实的想法，等我老了以后，我一定会为自己的懦弱感到后悔。”近期，曾...  \n",
       "4     很多人习惯无聊就刷朋友圈一遍一遍地刷，明知道没几个更新其实，微信上还有许多有价值的东西比如，...  \n",
       "...                                                 ...  \n",
       "2186  据吉林省吉林市政府新闻办公室微博通报，2月27日21时30分，吉林化纤公司发生一起安全生产事...  \n",
       "2187  “今年我建议国家加快建立自主的综合市场标准认证，不管是什么产品、什么国际标准，只要进入中国市...  \n",
       "2188  作为今年电影春节档最大“赢家”，由贾玲执导、编剧、主演的电影《你好，李焕英》创造的票房神话还...  \n",
       "2189  一起不正当竞争官司，揭示外卖餐饮行业两大巨头为争夺商户资源的竞争状况。2019年，饿了么因不...  \n",
       "2190  四川省峨边彝族自治县人民政府新闻办公室官方微博@今日峨边 2月27日通报，2月27日16时5...  \n",
       "\n",
       "[2191 rows x 4 columns]"
      ]
     },
     "execution_count": 163,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_url_out = pd.concat(l_df).reset_index(drop=True)\n",
    "df_url_out"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 164,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ==&mid=2651045790&idx=1&sn=5d37cea76111ff0ee5bc481f0ed6098e&chksm=4794bc7070e335668a09914017153d5913573ea2b62807179767a6833f5269ddee276fc3fac7#rd'"
      ]
     },
     "execution_count": 164,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 检查链接是否正确\n",
    "df_url_out.loc[0].link"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 165,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "      <th>content_text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>637</th>\n",
       "      <td>热搜第一，他们突然宣布离婚</td>\n",
       "      <td>2021-05-04</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>据央视消息，当地时间3日，微软联合创始人比尔·盖茨与妻子梅琳达通过联合声明，图源：@央视财经...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>638</th>\n",
       "      <td>刷爆屏！新华社都发声了</td>\n",
       "      <td>2021-05-04</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>据新华社消息，近日有网友建议，将已接种疫苗人群的绿色健康码升级为金色健康码。新华社发表评论：...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>639</th>\n",
       "      <td>生活糜烂、大搞权色钱色交易，双开！</td>\n",
       "      <td>2021-05-04</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>4月30日下午，中央纪委国家监委网站发布消息：海南省委原常委、三亚市委原书记严重违纪违法被开...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>640</th>\n",
       "      <td>一飞往广州航班延误，男子大闹机舱致再次延误</td>\n",
       "      <td>2021-05-04</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>5月2日，由的首都航空次航班，原定于19时15分起飞，因广州天气原因延误。登机后，一名乘客大...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>641</th>\n",
       "      <td>真牛！几年没洗的洗衣机，都给掏干净了</td>\n",
       "      <td>2021-05-04</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>越来越多的人已经有了这个意识：洗衣机要定时清洁，真的很脏！为什么洗衣机用久了会脏?如果长时间...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>642</th>\n",
       "      <td>广州布碎“围村”！工作人员：“罚谁都下不了手”，检察院介入</td>\n",
       "      <td>2021-05-04</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>近期以来，南都“记者帮”接到市民反映，广州市海珠区多个制衣城中村有大量布碎垃圾长期堆积，已严...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>643</th>\n",
       "      <td>广东14名驴友因暴雨被困深山，消防凌晨紧急出动</td>\n",
       "      <td>2021-05-04</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>昨日（5月3日）凌晨，清远消防接到求助称，连山县禾洞镇大龙山脉有人被困石公山上。‍‍【‍N视...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>644</th>\n",
       "      <td>外婆给狗狗编辫子走红，网友：狗狗承受了我小时候的痛</td>\n",
       "      <td>2021-05-04</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>5月1日，辽宁沈阳一位网友晒出“外婆给狗狗编辫子”的视频走红网络。‍‍【N视频】编辑：张明悦...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                             title create_time  \\\n",
       "637                  热搜第一，他们突然宣布离婚  2021-05-04   \n",
       "638                    刷爆屏！新华社都发声了  2021-05-04   \n",
       "639              生活糜烂、大搞权色钱色交易，双开！  2021-05-04   \n",
       "640          一飞往广州航班延误，男子大闹机舱致再次延误  2021-05-04   \n",
       "641             真牛！几年没洗的洗衣机，都给掏干净了  2021-05-04   \n",
       "642  广州布碎“围村”！工作人员：“罚谁都下不了手”，检察院介入  2021-05-04   \n",
       "643        广东14名驴友因暴雨被困深山，消防凌晨紧急出动  2021-05-04   \n",
       "644      外婆给狗狗编辫子走红，网友：狗狗承受了我小时候的痛  2021-05-04   \n",
       "\n",
       "                                                  link  \\\n",
       "637  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "638  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "639  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "640  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "641  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "642  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "643  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "644  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "\n",
       "                                          content_text  \n",
       "637  据央视消息，当地时间3日，微软联合创始人比尔·盖茨与妻子梅琳达通过联合声明，图源：@央视财经...  \n",
       "638  据新华社消息，近日有网友建议，将已接种疫苗人群的绿色健康码升级为金色健康码。新华社发表评论：...  \n",
       "639  4月30日下午，中央纪委国家监委网站发布消息：海南省委原常委、三亚市委原书记严重违纪违法被开...  \n",
       "640  5月2日，由的首都航空次航班，原定于19时15分起飞，因广州天气原因延误。登机后，一名乘客大...  \n",
       "641  越来越多的人已经有了这个意识：洗衣机要定时清洁，真的很脏！为什么洗衣机用久了会脏?如果长时间...  \n",
       "642  近期以来，南都“记者帮”接到市民反映，广州市海珠区多个制衣城中村有大量布碎垃圾长期堆积，已严...  \n",
       "643  昨日（5月3日）凌晨，清远消防接到求助称，连山县禾洞镇大龙山脉有人被困石公山上。‍‍【‍N视...  \n",
       "644  5月1日，辽宁沈阳一位网友晒出“外婆给狗狗编辫子”的视频走红网络。‍‍【N视频】编辑：张明悦...  "
      ]
     },
     "execution_count": 165,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 找出重复部分\n",
    "df_url_out[df_url_out.duplicated()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 166,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "      <th>content_text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>广州刚刚通报！新增本土12例无症状感染者，详情公布</td>\n",
       "      <td>2021-05-30</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>据广东卫健委今晨（30日）通报，5月29日0-24时，全省新增境外输入确诊病例1例，广州报告...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>佛山新增1例本土无症状感染者，详情公布</td>\n",
       "      <td>2021-05-30</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>据广东卫健委今早（30日）通报，5月29日0-24时，据佛山市南海区新冠肺炎防控指挥部今天（...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>广东惠州发布通知！</td>\n",
       "      <td>2021-05-30</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>5月29日下午，广东惠州大亚湾区新型冠状病毒肺炎疫情防控领导小组发布通告，，对澳头街道、西区...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>他是TVB“警察专业户”，曾因撑警被抹黑，店铺被打砸</td>\n",
       "      <td>2021-05-30</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>“如果我当时没有站出来说出真实的想法，等我老了以后，我一定会为自己的懦弱感到后悔。”近期，曾...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>别刷了！朋友圈越刷越孤独！</td>\n",
       "      <td>2021-05-30</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>很多人习惯无聊就刷朋友圈一遍一遍地刷，明知道没几个更新其实，微信上还有许多有价值的东西比如，...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2186</th>\n",
       "      <td>5人吸入有毒气体死亡，8人受伤</td>\n",
       "      <td>2021-02-28</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>据吉林省吉林市政府新闻办公室微博通报，2月27日21时30分，吉林化纤公司发生一起安全生产事...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2187</th>\n",
       "      <td>董明珠：建议月薪一万才纳税</td>\n",
       "      <td>2021-02-28</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>“今年我建议国家加快建立自主的综合市场标准认证，不管是什么产品、什么国际标准，只要进入中国市...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2188</th>\n",
       "      <td>“李焕英”票房超《流浪地球》，中国影史票房第三</td>\n",
       "      <td>2021-02-28</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>作为今年电影春节档最大“赢家”，由贾玲执导、编剧、主演的电影《你好，李焕英》创造的票房神话还...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2189</th>\n",
       "      <td>饿了么诉美团，一审获赔100万</td>\n",
       "      <td>2021-02-28</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>一起不正当竞争官司，揭示外卖餐饮行业两大巨头为争夺商户资源的竞争状况。2019年，饿了么因不...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2190</th>\n",
       "      <td>四川滚石砸中民房，被埋人员已无生命迹象</td>\n",
       "      <td>2021-02-28</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>四川省峨边彝族自治县人民政府新闻办公室官方微博@今日峨边 2月27日通报，2月27日16时5...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2183 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                           title create_time  \\\n",
       "0      广州刚刚通报！新增本土12例无症状感染者，详情公布  2021-05-30   \n",
       "1            佛山新增1例本土无症状感染者，详情公布  2021-05-30   \n",
       "2                      广东惠州发布通知！  2021-05-30   \n",
       "3     他是TVB“警察专业户”，曾因撑警被抹黑，店铺被打砸  2021-05-30   \n",
       "4                  别刷了！朋友圈越刷越孤独！  2021-05-30   \n",
       "...                          ...         ...   \n",
       "2186             5人吸入有毒气体死亡，8人受伤  2021-02-28   \n",
       "2187               董明珠：建议月薪一万才纳税  2021-02-28   \n",
       "2188     “李焕英”票房超《流浪地球》，中国影史票房第三  2021-02-28   \n",
       "2189             饿了么诉美团，一审获赔100万  2021-02-28   \n",
       "2190         四川滚石砸中民房，被埋人员已无生命迹象  2021-02-28   \n",
       "\n",
       "                                                   link  \\\n",
       "0     http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "1     http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "2     http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "3     http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "4     http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "...                                                 ...   \n",
       "2186  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "2187  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "2188  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "2189  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "2190  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "\n",
       "                                           content_text  \n",
       "0     据广东卫健委今晨（30日）通报，5月29日0-24时，全省新增境外输入确诊病例1例，广州报告...  \n",
       "1     据广东卫健委今早（30日）通报，5月29日0-24时，据佛山市南海区新冠肺炎防控指挥部今天（...  \n",
       "2     5月29日下午，广东惠州大亚湾区新型冠状病毒肺炎疫情防控领导小组发布通告，，对澳头街道、西区...  \n",
       "3     “如果我当时没有站出来说出真实的想法，等我老了以后，我一定会为自己的懦弱感到后悔。”近期，曾...  \n",
       "4     很多人习惯无聊就刷朋友圈一遍一遍地刷，明知道没几个更新其实，微信上还有许多有价值的东西比如，...  \n",
       "...                                                 ...  \n",
       "2186  据吉林省吉林市政府新闻办公室微博通报，2月27日21时30分，吉林化纤公司发生一起安全生产事...  \n",
       "2187  “今年我建议国家加快建立自主的综合市场标准认证，不管是什么产品、什么国际标准，只要进入中国市...  \n",
       "2188  作为今年电影春节档最大“赢家”，由贾玲执导、编剧、主演的电影《你好，李焕英》创造的票房神话还...  \n",
       "2189  一起不正当竞争官司，揭示外卖餐饮行业两大巨头为争夺商户资源的竞争状况。2019年，饿了么因不...  \n",
       "2190  四川省峨边彝族自治县人民政府新闻办公室官方微博@今日峨边 2月27日通报，2月27日16时5...  \n",
       "\n",
       "[2183 rows x 4 columns]"
      ]
     },
     "execution_count": 166,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 找出不重复的部分\n",
    "df_url_out[~df_url_out.duplicated()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 167,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 将抓取到的内容保存到本地 —— 数据输出\n",
    "with pd.ExcelWriter('{公众号}公众号链接及文章.xlsx'.format(公众号=公众号),mode='w',engine=\"openpyxl\") as writer:  \n",
    "            df_url_out.to_excel(writer, sheet_name=公众号)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
